The data has been fetched online from the Global Health Data Exchange (GHDX), which is described on its website as “the world’s most comprehensive catalog of surveys, censuses, vital statistics, and other health-related data”. GHDX is part of the Institute for Health Metrics and Evaluation (IHME), which is an independent global health research center at the University of Washington.
Concretely, the data comes from the results of the 2019 Global Burden of Disease (GBD) study, and was accessed using the following link: https://vizhub.healthdata.org/gbd-results. The data was acquired on the 9th of January 2023.
The search criteria applied are as follows: GBD Estimate: ‘Cause of death or injury’; Measure: ‘Prevalence’; Metric: ‘Number’ & ‘Percent’ & ‘Rate’; Cause: ‘Mental disorders’; Location: ‘All countries and territories’ (the state ‘Georgia’ of the United States of America was deselected, since it would otherwise not be possible to distinguish between the state and the country Georgia); Age: ‘10-24’; Sex: ‘Both’ (i.e. male and female); Year: all available years, from 1990 to 2019.
# Point the R session at the local project repository, so that the relative
# data path below resolves.
# NOTE(review): this absolute path is specific to the author's machine.
setwd("/Users/ottosejrskildsantesson/Desktop/Cultural Data Science/Cultural Data Science Rproject/AU668917_santesson_otto/exam_project/")

# Read the GBD extract described above from the data folder.
df <- read_csv("data/raw_GBD_data.csv")
## Rows: 66870 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): measure, location, sex, age, cause, metric
## dbl (4): year, val, upper, lower
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print the column names, so that adequate metadata (what each column means)
# can be provided.
colnames(df)
## [1] "measure" "location" "sex" "age" "cause" "metric"
## [7] "year" "val" "upper" "lower"

# World map data frame - essential for plotting the data on maps.
mapdata <- map_data("world")
# The two data frames (df, holding the prevalence of mental disorders, and
# mapdata, holding the geographical outlines) do not always spell a country
# the same way, e.g. "United States of America" in df and "USA" in mapdata.
# Each lookup vector below maps the spelling to replace (the element name) to
# the spelling to use instead (the element value).

# GBD spelling -> map spelling, applied to df$location.
gbd_to_map_names <- c(
  "Micronesia (Federated States of)" = "Micronesia",
  "United States of America" = "USA",
  "United Kingdom" = "UK",
  "Republic of Korea" = "South Korea",
  "Democratic People's Republic of Korea" = "North Korea",
  "Czechia" = "Czech Republic",
  "Republic of Moldova" = "Moldova",
  "Viet Nam" = "Vietnam",
  "Iran (Islamic Republic of)" = "Iran",
  "United Republic of Tanzania" = "Tanzania",
  "Venezuela (Bolivarian Republic of)" = "Venezuela",
  "Taiwan (Province of China)" = "Taiwan",
  "Russian Federation" = "Russia",
  "Brunei Darussalam" = "Brunei",
  "Lao People's Democratic Republic" = "Laos",
  "Bolivia (Plurinational State of)" = "Bolivia",
  "Syrian Arab Republic" = "Syria",
  "Côte d'Ivoire" = "Ivory Coast",
  "Cabo Verde" = "Cape Verde"
)
rename_location <- df$location %in% names(gbd_to_map_names)
df$location[rename_location] <-
  unname(gbd_to_map_names[df$location[rename_location]])

# Map spelling -> GBD spelling, applied to mapdata$region. The map lists the
# two halves of some island states as separate regions; both halves receive
# the combined name used in df.
map_to_gbd_names <- c(
  "Swaziland" = "Eswatini",
  "Trinidad" = "Trinidad and Tobago",
  "Tobago" = "Trinidad and Tobago",
  "Antigua" = "Antigua and Barbuda",
  "Barbuda" = "Antigua and Barbuda",
  "Saint Vincent" = "Saint Vincent and the Grenadines",
  "Grenadines" = "Saint Vincent and the Grenadines",
  "Saint Kitts" = "Saint Kitts and Nevis",
  "Nevis" = "Saint Kitts and Nevis"
)
rename_region <- mapdata$region %in% names(map_to_gbd_names)
mapdata$region[rename_region] <-
  unname(map_to_gbd_names[mapdata$region[rename_region]])

# "Virgin Islands" is only renamed for the rows whose subregion contains "US".
is_us_virgin_islands <-
  mapdata$region == "Virgin Islands" & grepl("US", mapdata$subregion)
mapdata$region[which(is_us_virgin_islands)] <- "United States Virgin Islands"
# Prepare the prevalence data for plotting:
#   1. keep only the 'Percent' metric, the only one needed for the
#      visualizations;
#   2. turn 'val' (share of 10-24 year olds with a mental disorder) from a
#      decimal fraction into a percentage;
#   3. keep only the locations that also occur in 'mapdata', which drops the
#      redundant regions such as the individual states of the USA.
df_filtered <- df %>%
  filter(metric == "Percent") %>%
  mutate(val = val * 100) %>%
  filter(location %in% mapdata$region)

# Attach the prevalence values to the map rows by country name, so the result
# is ready to be plotted.
joined_frame <- left_join(mapdata, df_filtered, by = c("region" = "location"))

# Keep only the countries/regions that we actually have data on.
filtered_joined_frame <- joined_frame %>%
  filter(!is.na(val))

# If the data from filtered_joined_frame were plotted on a map as it is, there
# would be a blank spot where Kosovo is supposed to be, since the raw data does
# not include any data on Kosovo. This is preferably avoided, so we construct a
# data frame for Kosovo that is compatible with the other data:
# Build placeholder rows for Kosovo (which has map rows but no prevalence
# data) for every year, and append them to the main data frame. Their 'val'
# stays NA, so Kosovo is drawn in the NA colour instead of leaving a hole.

# Start from the map rows whose region is "Kosovo".
joined_frame_kosovo <- joined_frame %>%
  filter(region == "Kosovo")

# Number of Kosovo rows; used twice below.
rep_number <- nrow(joined_frame_kosovo)

# The years covered by the data.
years <- seq(from = 1990, to = 2019)

# Repeat every Kosovo row once per year. seq_len() is used instead of
# 1:rep_number because 1:0 evaluates to c(1, 0); with no Kosovo rows that
# would index the wrong rows and make the 'year' assignment below fail.
joined_frame_kosovo <-
  joined_frame_kosovo[rep(seq_len(rep_number), each = length(years)), ]

# Each block of length(years) consecutive rows is one original row, so cycling
# through 'years' rep_number times gives every copy its own year.
joined_frame_kosovo$year <- rep(years, times = rep_number)

# Finally, add the Kosovo rows to the main data frame.
filtered_joined_frame <- rbind(filtered_joined_frame, joined_frame_kosovo)

# Preparing the data frame for Denmark is very simple - we simply specify the
# region to be "Denmark".
# Denmark: the map rows whose region is "Denmark".
filtered_joined_frame_dk <- filter(filtered_joined_frame, region == "Denmark")

# Europe: first list the countries that count as European for the plots ...
european_countries <- c(
  "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus",
  "Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus",
  "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Georgia",
  "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy",
  "Kazakhstan", "Kosovo", "Latvia", "Liechtenstein", "Lithuania",
  "Luxembourg", "North Macedonia", "Malta", "Moldova", "Monaco",
  "Montenegro", "Netherlands", "Norway", "Poland", "Portugal", "Romania",
  "San Marino", "Serbia", "Slovakia", "Slovenia", "Spain", "Sweden",
  "Switzerland", "Turkey", "Ukraine", "UK", "Vatican City", "Russia"
)

# ... then keep the rows of the main data frame whose region is in that vector.
filtered_joined_frame_eu <- filter(
  filtered_joined_frame,
  region %in% european_countries
)

# World: nothing has to be done for the visualization of all of the world's
# countries - the data frame is simply 'filtered_joined_frame'.

# Initializing 'gganimate', an R package that allows us to animate our ggplots,
# so we can see the development over time. Further packages are needed in
# order for it to run:
# Load gganimate together with the additional packages loaded alongside it.
pacman::p_load(gganimate, transformr, gifski, png)

# Animated map of Denmark, filled by the percentage of mental disorders and
# stepping through the years.
animated_map_plot_dk <- ggplot(filtered_joined_frame_dk) +
  # Country outline, filled with respect to the percentage of mental disorders
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = val),
    color = "black"
  ) +
  # Gradient used for the fill colour
  scale_fill_gradient(
    name = "% mental disorder",
    low = "#FFF04D",
    high = "red",
    na.value = "grey50"
  ) +
  # Longitude/latitude axes are uninformative here, so hide their text/titles
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  # The subtitle is filled in with the year of the current frame
  labs(
    title = "Proportion of 10-24 year olds, who have a mental disorder, Denmark",
    subtitle = "Year: {round(frame_time,0)}"
  ) +
  # Needed for the animation: animate with respect to 'year'
  transition_time(year)

# Plotting the plot
animated_map_plot_dk

# A different way to visualize the data in an animated fashion: a bar plot.
# Animated bar plot for Denmark: a single bar whose height is the percentage
# of 10-24 year olds with a mental disorder, stepping through the years.
animated_bar_plot_dk <- df_filtered %>%
  filter(location == "Denmark") %>%
  ggplot(aes(location, val)) +
  # Animate with respect to 'year'
  transition_time(year) +
  geom_col(fill = "red", color = "white", width = 0.25) +
  # "none" is the supported way to drop a legend; passing FALSE is deprecated
  # as of ggplot2 3.3.4 and raises a warning.
  guides(fill = "none") +
  # One tick per percentage point from 0 to 18
  scale_y_continuous(breaks = seq(from = 0, to = 18, by = 1)) +
  ylab("Percentage") +
  theme(axis.title.x = element_blank()) +
  labs(title = "Proportion of 10-24 year olds, who have a mental disorder, Denmark") +
  # The subtitle is filled in with the year of the current frame
  labs(subtitle = "Year: {round(frame_time,0)}")
# Show the animated bar plot for Denmark.
animated_bar_plot_dk

# Animated map of Europe, filled by the percentage of mental disorders.
animated_map_plot_eu <- ggplot(filtered_joined_frame_eu) +
  # Filling out the countries with respect to the percentage of mental disorders
  geom_polygon(aes(x = long, y = lat, group = group, fill = val), color = "black") +
  # Animate with respect to 'year'
  transition_time(year) +
  scale_fill_gradient(name = "% mental disorder", low = "#FFF04D", high = "red", na.value = "grey50") +
  # Cropping the map, so it looks neater
  coord_cartesian(xlim = c(-25, 45), ylim = c(32, 72)) +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  labs(title = "Proportion of 10-24 year olds, who have a mental disorder, Europe") +
  labs(subtitle = "Year: {round(frame_time,0)}")

# Show the animated map of Europe.
animated_map_plot_eu

# Animated bar plot with one bar per European country.
animated_bar_plot_eu <- df_filtered %>%
  filter(location %in% european_countries) %>%
  ggplot(aes(location, val)) +
  transition_time(year) +
  geom_col(fill = "#001489", color = "white") +
  # "none" is the supported way to drop a legend; passing FALSE is deprecated
  # as of ggplot2 3.3.4 and raises a warning.
  guides(fill = "none") +
  # Tilt and slightly shrink the country names so that they fit on the x-axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text.x = element_text(size = rel(0.90))) +
  scale_y_continuous(breaks = seq(from = 0, to = 28, by = 1)) +
  ylab("Percentage") +
  theme(axis.title.x = element_blank()) +
  labs(title = "Proportion of 10-24 year olds, who have a mental disorder, Europe") +
  labs(subtitle = "Year: {round(frame_time,0)}")

# Show the animated bar plot for Europe.
animated_bar_plot_eu

# Animated map of all of the world's countries.
animated_map_plot_world <- ggplot(filtered_joined_frame) +
  geom_polygon(aes(x = long, y = lat, group = group, fill = val), color = "black") +
  transition_time(year) +
  scale_fill_gradient(name = "% mental disorder", low = "#FFF04D", high = "red", na.value = "grey50") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  labs(title = "Proportion of 10-24 year olds, who have a mental disorder, World") +
  labs(subtitle = "Year: {round(frame_time,0)}")
# Show the animated world map.
animated_map_plot_world

# Animated bar plot with one bar per country. Since all of the world's
# countries have to fit on the x-axis, the plot is going to be very cluttered.
animated_bar_plot_world <- df_filtered %>%
  filter(location %in% joined_frame$region) %>%
  ggplot(aes(location, val)) +
  # Animate with respect to 'year'
  transition_time(year) +
  geom_col(fill = "#242526", color = "white") +
  # "none" is the supported way to drop a legend; passing FALSE is deprecated
  # as of ggplot2 3.3.4 and raises a warning.
  guides(fill = "none") +
  # Vertical, small country names - the only way to fit them all
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(axis.text.x = element_text(size = rel(0.5))) +
  scale_y_continuous(breaks = seq(from = 0, to = 28, by = 1)) +
  ylab("Percentage") +
  theme(axis.title.x = element_blank()) +
  labs(title = "Proportion of 10-24 year olds, who have a mental disorder, World") +
  labs(subtitle = "Year: {round(frame_time,0)}")
# Show the animated bar plot for the world.
animated_bar_plot_world

# Since it is not possible to show animated plots in a PDF, some additional
# (still) plots are made to put into the assignment.

# A plot showing the development of the percentage of 10-24 year olds with a
# mental disorder in Denmark from 1990 to 2019.
# Mapping 'color' to 'val', so the development is visualized with colors too.
ggplot(filtered_joined_frame_dk, aes(year, val, color = val)) +
  # Using the line geom to visualize the data
  geom_line() +
  # Defining the gradient color
  scale_color_gradient(low = "#FF8308", high = "#C6080C") +
  # One tick per year on the x-axis, so the plot is more legible
  scale_x_continuous(breaks = seq(from = 1990, to = 2019, by = 1)) +
  # Adjusting the angle of the labels on the x-axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Hardcoding the limits of the x-axis, so it is exactly how we want it
  # (otherwise, there would be some ugly spacing)
  coord_cartesian(xlim = c(1991.28, 2017.72)) +
  # Removing the legend, since it is redundant. "none" is used because passing
  # FALSE is deprecated as of ggplot2 3.3.4 and raises a warning.
  guides(color = "none") +
  # Renaming the x- and y-axis
  xlab("Year") +
  ylab("Percentage") +
  # Defining the plot title ...
  ggtitle("Percentage of mental disorders among 10-24 year olds in Denmark, from 1990 to 2019") +
  # ... and adjusting its size - otherwise, the title wouldn't fit the plot
  theme(plot.title = element_text(size = rel(1.05)))

# Making the map plots from before, but only with the 2019 data:
# Denmark - still map for 2019 only
filtered_joined_frame_dk %>%
  filter(year == 2019) %>%
  ggplot() +
  # Fill by the percentage of mental disorders, rounded to two decimals
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = round(val, 2)),
    color = "black"
  ) +
  # The colour scale is fixed to the 16-18 range
  scale_fill_gradient(
    name = "% mental disorder",
    limits = c(16, 18),
    low = "#FFF04D",
    high = "red",
    na.value = "grey50"
  ) +
  # Hide the uninformative axis text and titles
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  labs(
    title = "Proportion of 10-24 year olds, who have a mental disorder, Denmark",
    subtitle = "Year: 2019"
  )

# Europe
# Still map of Europe for 2019 only
filtered_joined_frame_eu %>%
  filter(year == 2019) %>%
  ggplot() +
  # Fill by the percentage of mental disorders, rounded to two decimals
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = round(val, 2)),
    color = "black"
  ) +
  scale_fill_gradient(
    name = "% mental disorder",
    low = "#FFF04D",
    high = "red",
    na.value = "grey50"
  ) +
  # Crop the map to the same window as the animated European map
  coord_cartesian(xlim = c(-25, 45), ylim = c(32, 72)) +
  # Hide the uninformative axis text and titles
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  labs(
    title = "Proportion of 10-24 year olds, who have a mental disorder, Europe",
    subtitle = "Year: 2019"
  )

# World
# Still map of the world for 2019 only
filtered_joined_frame %>%
  filter(year == 2019) %>%
  ggplot() +
  # Fill by the percentage of mental disorders, rounded to two decimals
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = round(val, 2)),
    color = "black"
  ) +
  scale_fill_gradient(
    name = "% mental disorder",
    low = "#FFF04D",
    high = "red",
    na.value = "grey50"
  ) +
  # Hide the uninformative axis text and titles
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  labs(
    title = "Proportion of 10-24 year olds, who have a mental disorder, World",
    subtitle = "Year: 2019"
  )

# Making the bar plots from before, but only with the 2019 data:
# Europe - still bar plot for 2019 only, one bar per European country
df_filtered %>%
  filter(location %in% european_countries) %>%
  filter(year == 2019) %>%
  ggplot(aes(location, val)) +
  geom_col(fill = "#001489") +
  # "none" is the supported way to drop a legend; passing FALSE is deprecated
  # as of ggplot2 3.3.4 and raises a warning.
  guides(fill = "none") +
  # Tilt and slightly shrink the country names so that they fit on the x-axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text.x = element_text(size = rel(0.90))) +
  ylab("Percentage") +
  theme(axis.title.x = element_blank()) +
  labs(title = "Proportion of 10-24 year olds, who have a mental disorder, Europe") +
  labs(subtitle = "Year: 2019")

# World
# Still bar plot for 2019 only, one bar per country.
# Be aware - since it is all of the countries of the world, the plot is going
# to be very cluttered. There is no way around it.
df_filtered %>%
  filter(location %in% joined_frame$region) %>%
  filter(year == 2019) %>%
  ggplot(aes(location, val)) +
  geom_col(fill = "#242526", color = "white") +
  # "none" is the supported way to drop a legend; passing FALSE is deprecated
  # as of ggplot2 3.3.4 and raises a warning.
  guides(fill = "none") +
  # Vertical, small country names - the only way to fit them all
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(axis.text.x = element_text(size = rel(0.5))) +
  ylab("Percentage") +
  theme(axis.title.x = element_blank()) +
  labs(title = "Proportion of 10-24 year olds, who have a mental disorder, World") +
  labs(subtitle = "Year: 2019")